Primary Vignette - Classification Strategies
Objectives
Perform exploratory data analysis
Reduce dimensionality using principal component analysis
Employ logistic regression using `glm()` and `multinom()`
Build a random forest model using cross-validation
We’ll illustrate these strategies using the California Household Travel Survey (CHTS) dataset.

Data Cleaning
First, we want to load any necessary packages and our 3 datasets into R Studio. We will then merge all 3 datasets into 1 dataframe and convert variables into factors as needed while also filtering out numeric variables that have numbers that represent NAs.
# Loading necessary packages
library(tidyverse)    # core data wrangling; also attaches ggplot2 and dplyr
library(ggplot2)      # attached explicitly for clarity
library(dplyr)
library(tidymodels)   # modeling framework (resampling, recipes)
library(sparsesvd)
library(nnet)         # multinom() for multinomial logistic regression
library(sf)           # simple-features spatial data
library(mapview)      # interactive maps
mapviewOptions(fgb = FALSE)  # fgb = FALSE so maps render in knitted HTML
library(leafsync)     # latticeview() for synced map panels
library(maps)
library(randomForest)
library(vip)          # variable-importance plots
library(SparseM)
library(Matrix)
library(kableExtra)
# Read in datasets
PersonData <- read_rds('./Data/raw/PersonData_111A.Rds')
HHData <- read_rds('./Data/raw/HHData_111A.Rds')
hh_bgDensity <- read_rds('./Data/raw/hh_bgDensity.Rds')
county_shp <- st_read("./Data/raw/counties/counties.shp")
# Console output from st_read():
#> Reading layer `counties' from data source
#>   `.../Data/raw/counties/counties.shp' using driver `ESRI Shapefile'
#> Simple feature collection with 58 features and 17 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: -124.3926 ymin: 32.53578 xmax: -114.1312 ymax: 42.00952
#> Geodetic CRS:  WGS 84
# Merge datasets
# left_join() with no `by` argument joins on all columns the tables share
personHHData <- left_join(PersonData, HHData) %>%
  left_join(hh_bgDensity)

# Codes 8/9, 98/99, and 998/999 are survey placeholders for missing values;
# drop rows carrying any of them so they are not treated as real numbers
data <- personHHData %>%
  filter(WorkDaysWk != 8, WorkDaysWk != 9,
         TypicalHoursWk != 998, TypicalHoursWk != 999,
         TransitTripsWk != 98, TransitTripsWk != 99,
         WalkTripsWk != 98, WalkTripsWk != 99,
         BikeTripsWk != 98, BikeTripsWk != 99) %>%
  # Convert all categorical variables to factors in a single across() pass
  mutate(across(c(HH_anyTransitRider, HH_homeType, HH_homeowner,
                  HH_isHispanic, HH_intEnglish, HH_income, Male,
                  persWhite, persHisp, persAfricanAm, persNativeAm,
                  persAsian, persPacIsl, persOthr, persDKrace, persRFrace,
                  bornUSA, DriverLic, TransitPass, Employed, WorkFixedLoc,
                  WorkHome, WorkNonfixed, FlexSched, FlexPrograms,
                  Disability, DisLicensePlt, Student, workday,
                  SchoolMode, WorkMode, EducationCompl),
                as.factor),
         # Make the household id unique per person by appending the person
         # number, then drop the now-redundant pnum column
         hhid = paste(hhid, pnum)) %>%
  select(-pnum)

Next, we will preprocess our data to be ready for PCA by selecting only the numeric columns and scaling the data. Then, we will want to add back in our response variable column, bg_group, and our ID column, hhid, to the scaled data. Lastly, we want to remove any NA values from the data, since PCA cannot handle them.
# Preprocess data
# Keep only numeric columns; CTFIP (county FIPS code) and bg_density are
# dropped because they are identifiers/derived values, not PCA inputs
numeric_data <- data %>%
  select(where(is.numeric)) %>%
  select(-CTFIP, -bg_density)
# Center and scale so no single large-variance column dominates the PCA;
# then re-attach the ID (hhid) and response (bg_group) columns, which the
# PCA itself will not use
scaled_data <- scale(numeric_data) %>%
  as.data.frame() %>%
  mutate(hhid = data$hhid, .before = 1) %>%
  mutate(bg_group = data$bg_group)
# PCA cannot handle missing values, so drop incomplete rows
scaled_data_clean <- na.omit(scaled_data)

Exploratory Data Analysis
Before we partition our data and fit classification models, we want to get to know our data through some exploratory visualizations. Since our data is geographically structured, we found it helpful to visualize our data on a map.
Static Map
# Aggregate the survey to one row per county x density-group cell
county_bg_aggreg <- data %>%
  group_by(County, CTFIP, bg_group) %>%   # group by county, CTFIP, and bg_group
  mutate(count = n()) %>%                 # persons in each cell (constant per cell)
  # superseded summarise_at(vars(-hhid), mean) replaced with across();
  # every column except the hhid identifier is averaged within each cell
  summarise(across(-hhid, mean))
county_bg_shp <- county_shp %>%
  # Cross the shapefile with the four density groups so each county appears
  # once per group, then attach the aggregated survey values
  merge(data.frame(bg_group = c("Urban", "Suburban", "Exurban", "Rural"))) %>%
  left_join(county_bg_aggreg)
# Get the CA county outlines, then replicate them once per density group so
# the polygons can later be faceted by bg_group
density_groups <- data.frame(bg_group = c("Urban", "Suburban", "Exurban", "Rural"))
county <- ggplot2::map_data("county", region = "california")
county_bg <- merge(county, density_groups)
# Attach the aggregated survey values to the outlines; full_join keeps
# counties with no survey data so they still draw (as grey/NA)
county_bg_all <- county_bg_aggreg %>%
  mutate(subregion = tolower(County)) %>%
  full_join(county_bg, by = c("subregion", "bg_group"))
ggplot(county_bg_all) +
  # Use `group = group` (map_data's polygon id) rather than subregion:
  # counties made of several polygons (e.g. with islands) would otherwise be
  # drawn as a single self-crossing path
  geom_polygon(aes(x = long, y = lat, group = group, fill = Sum_PMT),
               colour = "white") +
  scale_fill_distiller(palette = "YlGnBu", direction = 1) +
  facet_wrap(vars(bg_group), nrow = 2) + # multi-panel plots using facet_wrap(), plot in 2 rows
  ggtitle("Total PMT in California at County-level") +
  theme_void() +
  theme(legend.position = "bottom")
These maps show the total distance traveled by county. The grey counties represent missing data. As we can see, the northern areas of the suburban and urban maps have the most missing data. However, all 4 maps show roughly the same amount of distance traveled. The map with the darkest areas (indicating the most miles traveled) is the exurban one.
Sum Trips by Residential Area
# Helper: build one mapview layer of Sum_Trips for a single density group
trip_map <- function(group_name, layer_label) {
  mapview(filter(county_bg_shp, bg_group == group_name),
          zcol = "Sum_Trips", legend = TRUE, popup = NULL,
          layer.name = layer_label)
}
urban_TripMap  <- trip_map("Urban", "Urban Trips")
suburb_TripMap <- trip_map("Suburban", "Suburban Trips")
exurb_TripMap  <- trip_map("Exurban", "Exurban Trips")
rural_TripMap  <- trip_map("Rural", "Rural Trips")
# Show all four maps side by side with synchronized pan/zoom
latticeview(urban_TripMap, suburb_TripMap, exurb_TripMap, rural_TripMap, sync = "all")